install packages

call libraries

## Package 'mclust' version 5.3
## Type 'citation("mclust")' for citing this R package in publications.
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:plyr':
## 
##     rename
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:plyr':
## 
##     desc
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy

Read data

#setwd("C:/Users/Ling/Documents/My Study/STAT5003/Assignment2/predicting-novel-kinase-substrates")

# load the full insulin phosphoproteome data set (mostly unlabelled sites)
dt.Insulin <- read.delim("datasets/InsulinPhospho.txt")
#dim(dt.Insulin)

# load the known-substrate lists (partial labels); one identifier per row, no header
dt.Akt <- read.delim("datasets/Akt_substrates.txt", header = F)
dt.mTOR <- read.delim("datasets/mTOR_substrates.txt", header = F)   
#dim(dt.Akt)
#dim(dt.mTOR)

# map the known labels onto the full data set; sites found in neither
# substrate list keep Kinases = NA, i.e. remain unlabelled
dt.Insulin$Kinases[dt.Insulin$Identifier %in% dt.Akt$V1] <- "Akt"
dt.Insulin$Kinases[dt.Insulin$Identifier %in% dt.mTOR$V1] <- "mTOR"
unique(dt.Insulin$Kinases)
## [1] NA     "mTOR" "Akt"
# generate a subset containing only the labelled (Akt / mTOR) sites
dt.labeled <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt' | dt.Insulin$Kinases == 'mTOR'), ]

load 2016 data

#setwd("C:/Users/Ling/Documents/My Study/STAT5003/Assignment2/predicting-novel-kinase-substrates")

# load prediction_2016 result

df.Akt <- read.csv(file="datasets/Akt_predictions2016.csv", header=TRUE, sep=",")
df.mTOR <- read.csv(file="datasets/mTOR_predictions2016.csv", header=TRUE, sep=",")
colnames(df.Akt)[1] <- "GeneSymbol"
colnames(df.mTOR)[1] <- "GeneSymbol"

# Build the "GENE;site;" identifier used to join against dt.Insulin, and
# coerce the prediction score to numeric.  Bare column names are used inside
# mutate() (dplyr data masking) instead of df.Akt$... -- referring back to
# the original data frame inside a pipeline silently breaks if any earlier
# step filters or reorders rows.
df.Akt <- df.Akt %>%
  mutate(Identifier = paste0(str_trim(str_to_upper(GeneSymbol), side="both"), ";",
                             str_trim(Phosphorylation.site, side="both"), ";"),
         Full.model.predict = as.numeric(Full.model.predict))
names(df.Akt)[colnames(df.Akt)=="Full.model.predict"] <- "predictResult" 
head(df.Akt)
##   GeneSymbol Phosphorylation.site Sequence.window Uniprot.ID      IPI.ID
## 1       Irs2                  303   FRPRSKSQSSGSS     B9EJW3 IPI00923679
## 2     Cep131                  114   GRRRVASLSKASS     B1AXI9 IPI00889961
## 3      Ndrg3                  331   ARSRTHSTSSSIG     Q9QYF9 IPI00136107
## 4       Flnc                 2234   GRERLGSFGSITR   Q8VHX6-1 IPI00664670
## 5    Cables1                  272   RRCRTLSGSPRPK     B2RWU9 IPI00929838
## 6       Rtkn                  520   GRPRTFSLDAAPA   Q8C6B2-1 IPI00226220
##   predictResult Motif.predict Phosphoproteome.predict     Delta
## 1     0.9941443     0.9620853               0.9746454 0.5092316
## 2     0.9932284     0.7725118               0.9998626 0.4755840
## 3     0.9900757     0.8578199               0.9847889 0.4960437
## 4     0.9891778     0.7630332               0.9974299 0.4717430
## 5     0.9868492     0.8293839               0.9873219 0.4549283
## 6     0.9832485     0.8104265               0.9775198 0.5168305
##                          Uniprot.database   Identifier
## 1   http://www.uniprot.org/uniprot/B9EJW3    IRS2;303;
## 2   http://www.uniprot.org/uniprot/B1AXI9  CEP131;114;
## 3   http://www.uniprot.org/uniprot/Q9QYF9   NDRG3;331;
## 4 http://www.uniprot.org/uniprot/Q8VHX6-1   FLNC;2234;
## 5   http://www.uniprot.org/uniprot/B2RWU9 CABLES1;272;
## 6 http://www.uniprot.org/uniprot/Q8C6B2-1    RTKN;520;
# Same identifier construction and score coercion for the mTOR predictions;
# bare column names inside mutate() replace the fragile df.mTOR$... form.
df.mTOR <- df.mTOR %>%
  mutate(Identifier = paste0(str_trim(str_to_upper(GeneSymbol), side="both"), ";",
                             str_trim(Phosphorylation.site, side="both"), ";"),
         Full.model.predict = as.numeric(Full.model.predict))
names(df.mTOR)[colnames(df.mTOR)=="Full.model.predict"] <- "predictResult" 
head(df.mTOR)
##   GeneSymbol Phosphorylation.site Sequence.window Uniprot.ID      IPI.ID
## 1       Ulk2                  772   GRVCVGSPPGPGL     Q9QY01 IPI00336256
## 2      C2cd5                  295   NQTYSFSPSKSYS   Q7TPS5-1 IPI00408171
## 3       Ulk2                  781   GPGLGSSPPGAEG     Q9QY01 IPI00336256
## 4    Stk11ip                  444   LETMGSSPLSTTK     Q3TAA7 IPI00467059
## 5      Wdr91                  257   PASLSQSPRVGFL     Q7TMQ7 IPI00339693
## 6       Oxr1                   62   RPPSPASPEGPDT     Q8C715 IPI00654216
##   predictResult Motif.predict Phosphoproteome.predict     Delta
## 1     0.9931176     0.8024691               0.9872836 0.7504053
## 2     0.9872644     0.8024691               0.9721223 0.8133507
## 3     0.9856206     0.7901235               0.9772598 0.8175276
## 4     0.9828952     0.7654321               0.9779556 0.6570651
## 5     0.9813423     0.7283951               0.9961476 0.7576563
## 6     0.9733467     0.8765432               0.9498938 0.8432717
##                          Uniprot.database   Identifier
## 1   http://www.uniprot.org/uniprot/Q9QY01    ULK2;772;
## 2 http://www.uniprot.org/uniprot/Q7TPS5-1   C2CD5;295;
## 3   http://www.uniprot.org/uniprot/Q9QY01    ULK2;781;
## 4   http://www.uniprot.org/uniprot/Q3TAA7 STK11IP;444;
## 5   http://www.uniprot.org/uniprot/Q7TMQ7   WDR91;257;
## 6   http://www.uniprot.org/uniprot/Q8C715     OXR1;62;

summarize data

head(dt.Insulin)
##            Identifier    Seq.Window   Avg.Fold       AUC       X15s
## 1       100043914;88; GRHERRSSAEQHS  1.0315122 0.2967288 -0.1858515
## 2       100043914;89; RHERRSSAEQHSE  1.2160532 0.5279128  0.1077492
## 3   1110018J18RIK;18; CAGRVPSPAGSVT  0.2860675 0.5240430  0.7031150
## 4  1110037F02RIK;133; SVDRVISHDRDSP -0.4100882 0.6480379 -0.3962030
## 5  1110037F02RIK;138; ISHDRDSPPPPPP -0.3664219 0.5002719 -0.3962030
## 6 1110037F02RIK;1628; FLSEPSSPGRSKT -0.2148644 0.4370191 -0.2185709
##         X30s         X1m        X2m        X5m         X10m       X20m
## 1  0.5813547  1.88857193  2.1018999  1.8426771  2.287873789  1.3431367
## 2  0.2802415  1.24719923  2.0271668  2.6376657  2.374491409  1.1032924
## 3 -0.0905068  0.04348222  0.3846277  0.4637556  0.425594237  0.3769611
## 4 -0.4970602 -0.37816293 -0.6080681 -0.4343925 -0.445322740 -0.2649039
## 5 -0.2954442 -0.11986335 -0.2672015 -0.3476758 -0.283676544 -0.6626509
## 6 -0.2469075 -0.01866835 -0.3877707 -0.2848602  0.006343619 -0.1311892
##          X60m      Ins.1          LY      Ins.2          MK Kinases
## 1 -1.60756467  0.6526217  0.34193177  2.1570012  2.13792056    <NA>
## 2 -0.04938089  0.6112755  0.04620434  3.0963462  3.22683218    <NA>
## 3 -0.01848881 -0.1988757 -0.14777344 -0.2610874 -0.02748394    <NA>
## 4 -0.25659219  0.1735112 -0.57074291  0.3012657 -0.13305247    <NA>
## 5 -0.55865953  0.2292187 -0.38138545  0.4130244 -0.13305247    <NA>
## 6 -0.43729207  0.2987756 -0.45838604  0.1086583  0.02536763    <NA>
head(dt.labeled)
##         Identifier    Seq.Window  Avg.Fold       AUC         X15s
## 504    AHNAK;5536; VEAEASSPKGKFS 0.4263345 0.6567462 -0.176054689
## 640    AKT1S1;255; TQQYAKSLPVSVP 0.9899918 0.4360069 -0.007265314
## 646    AKT1S1;318; PRPRLNTSDFQKL 3.5063686 0.2030335  1.793249373
## 761  ANKRD17;2041; PVSSPSSPSPPAQ 0.6492043 0.5362401  0.009309093
## 904      APRF;727; TIDLPMSPRTLDS 0.5521512 0.5233701 -0.426227432
## 1215    AS160;324; FRSRCSSVTGVMQ 2.6444937 0.2217749  1.090046133
##             X30s        X1m        X2m       X5m     X10m     X20m
## 504  -0.21739065 -0.3346839 -0.4451926 0.2553367 1.046421 1.510936
## 640   0.16002791  0.5473494  1.0758795 1.5044226 1.501370 1.589304
## 646   2.94962673  3.6929821  3.6410960 4.0850147 3.898876 3.978123
## 761  -0.01560686  0.2071657  0.3408147 1.0057508 1.257341 1.197354
## 904  -0.96441078 -0.1944668 -0.2400350 1.3990019 1.448948 2.039807
## 1215  2.41608831  2.7808397  2.8150416 3.1520775 2.964196 2.874175
##          X60m    Ins.1         LY    Ins.2         MK Kinases
## 504  1.771303 1.758602  1.2239168 2.176801  1.7279205    mTOR
## 640  1.548847 1.228480 -1.4821297 1.369718 -0.3435128    mTOR
## 646  4.011981 2.443501  0.9018806 2.615299 -0.8495963     Akt
## 761  1.191506 1.030689  0.1924467 0.384713  0.3352992    mTOR
## 904  1.354592 1.835358  1.1196218 1.994797  2.1098954    mTOR
## 1215 3.063486 2.561888  0.3589588 2.661638 -0.6009274     Akt
# remove the identifier column since it is just a phosphorylation-site key
dt.labeled$Identifier <- NULL
dt.labeled$Kinases <- as.factor(dt.labeled$Kinases) # only two classes so set as a factor
dt.labeled$Kinases <- as.numeric(dt.labeled$Kinases) # recode the factor as 1 (Akt) / 2 (mTOR) so it can double as e.g. a plotting colour
dt.labeled$Seq.Window <- as.character(dt.labeled$Seq.Window) # convert from factor to character
head(dt.labeled)
##         Seq.Window  Avg.Fold       AUC         X15s        X30s        X1m
## 504  VEAEASSPKGKFS 0.4263345 0.6567462 -0.176054689 -0.21739065 -0.3346839
## 640  TQQYAKSLPVSVP 0.9899918 0.4360069 -0.007265314  0.16002791  0.5473494
## 646  PRPRLNTSDFQKL 3.5063686 0.2030335  1.793249373  2.94962673  3.6929821
## 761  PVSSPSSPSPPAQ 0.6492043 0.5362401  0.009309093 -0.01560686  0.2071657
## 904  TIDLPMSPRTLDS 0.5521512 0.5233701 -0.426227432 -0.96441078 -0.1944668
## 1215 FRSRCSSVTGVMQ 2.6444937 0.2217749  1.090046133  2.41608831  2.7808397
##             X2m       X5m     X10m     X20m     X60m    Ins.1         LY
## 504  -0.4451926 0.2553367 1.046421 1.510936 1.771303 1.758602  1.2239168
## 640   1.0758795 1.5044226 1.501370 1.589304 1.548847 1.228480 -1.4821297
## 646   3.6410960 4.0850147 3.898876 3.978123 4.011981 2.443501  0.9018806
## 761   0.3408147 1.0057508 1.257341 1.197354 1.191506 1.030689  0.1924467
## 904  -0.2400350 1.3990019 1.448948 2.039807 1.354592 1.835358  1.1196218
## 1215  2.8150416 3.1520775 2.964196 2.874175 3.063486 2.561888  0.3589588
##         Ins.2         MK Kinases
## 504  2.176801  1.7279205       2
## 640  1.369718 -0.3435128       2
## 646  2.615299 -0.8495963       1
## 761  0.384713  0.3352992       2
## 904  1.994797  2.1098954       2
## 1215 2.661638 -0.6009274       1
str(dt.labeled)
## 'data.frame':    48 obs. of  16 variables:
##  $ Seq.Window: chr  "VEAEASSPKGKFS" "TQQYAKSLPVSVP" "PRPRLNTSDFQKL" "PVSSPSSPSPPAQ" ...
##  $ Avg.Fold  : num  0.426 0.99 3.506 0.649 0.552 ...
##  $ AUC       : num  0.657 0.436 0.203 0.536 0.523 ...
##  $ X15s      : num  -0.17605 -0.00727 1.79325 0.00931 -0.42623 ...
##  $ X30s      : num  -0.2174 0.16 2.9496 -0.0156 -0.9644 ...
##  $ X1m       : num  -0.335 0.547 3.693 0.207 -0.194 ...
##  $ X2m       : num  -0.445 1.076 3.641 0.341 -0.24 ...
##  $ X5m       : num  0.255 1.504 4.085 1.006 1.399 ...
##  $ X10m      : num  1.05 1.5 3.9 1.26 1.45 ...
##  $ X20m      : num  1.51 1.59 3.98 1.2 2.04 ...
##  $ X60m      : num  1.77 1.55 4.01 1.19 1.35 ...
##  $ Ins.1     : num  1.76 1.23 2.44 1.03 1.84 ...
##  $ LY        : num  1.224 -1.482 0.902 0.192 1.12 ...
##  $ Ins.2     : num  2.177 1.37 2.615 0.385 1.995 ...
##  $ MK        : num  1.728 -0.344 -0.85 0.335 2.11 ...
##  $ Kinases   : num  2 2 1 2 2 1 1 1 1 1 ...

Analyze data

Attempt to do a pairwise plot of the features to understand if the labelled data is separable

#install.packages("lattice")
#install.packages("ggplot2")

# Feature subsets for the pairwise scatterplots: the summary / treatment
# features and the temporal (time-course) features, each ending with the
# class label column.
filteredFeatures <- c("Avg.Fold", "AUC", "Ins.1", "Ins.2", "LY", "MK", "Kinases")
temporalFeatures <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Kinases")

# Pairwise scatterplots of the temporal features, coloured by kinase class
pairs(Kinases ~ ., data = dt.labeled[, temporalFeatures], col = dt.labeled$Kinases)

# Pairwise scatterplots of the remaining features, coloured by kinase class
pairs(Kinases ~ ., data = dt.labeled[, filteredFeatures], col = dt.labeled$Kinases)

print("Plots appear to indicate that the labeled data can be separated. Thinking SVM should work.") 
## [1] "Plots appear to indicate that the labeled data can be separated. Thinking SVM should work."

seq.window pattern transformation

visualise the patterns

# visualise the per-position residue-frequency patterns of the labelled windows
# NOTE(review): aktSequences.pfm / mTORSequences.pfm are created in the
# "pattern match score" chunk *below* -- this chunk only renders correctly
# because those objects already existed when the document was knitted;
# confirm chunk ordering before re-running from scratch.
library(RColorBrewer)

par(mfrow=c(1, 1), mar=c(3, 3, 3, 3) + 0.1)
# Stacked barplot of the Akt position frequency matrix, one stack per position.
# NOTE(review): the "Paired" palette supports at most 12 colours; if the
# matrix has more residue rows, brewer.pal() warns -- verify against
# nrow(aktSequences.pfm).
barplot(aktSequences.pfm, col=brewer.pal(nrow(aktSequences.pfm), "Paired"),
            legend.text = rownames(aktSequences.pfm),
            args.legend=list(x=ncol(aktSequences.pfm) + 5,y=max(colSums(aktSequences.pfm))+4,bty = "n"),
            main="Akt Sequence Window patterns")

par(mfrow=c(1, 1), mar=c(3, 3, 3, 3) + 0.1)
# Same visualisation for the mTOR substrate windows
barplot(mTORSequences.pfm, col=brewer.pal(nrow(mTORSequences.pfm), "Paired"),
            legend.text = rownames(mTORSequences.pfm),
            args.legend=list(x=ncol(mTORSequences.pfm)+5,y=max(colSums(mTORSequences.pfm))+4,bty = "n"),
            main="mTOR Sequence Window patterns")

pattern match score

# Pattern match score
# * Basic Formula  
#   + With the probability matrix (PSSM) it is possible to calculate a total score for a specific sequence
#   + The higher the score is, the higher is the probability that the sequence contains the searched motif.
#   + Convert the sequence window in the unlabelled dataset to a sum based on the Position-specific Scoring Matrix (PSSM)

library(seqLogo)
## Loading required package: grid
# Position frequency matrices (as probabilities) built from the labelled windows.
# NOTE(review): consensusMatrix() is a Biostrings function, and the
# phosphoSites.*.allSequences / aktSequences / mTORSequences objects are
# created elsewhere in the document -- confirm they exist before this runs.
aktSequences.pfm <- consensusMatrix(phosphoSites.Akt.allSequences, as.prob = T)
colnames(aktSequences.pfm) <- colnames(aktSequences)

mTORSequences.pfm <- consensusMatrix(phosphoSites.mTOR.allSequences, as.prob = T) 
colnames(mTORSequences.pfm) <- colnames(mTORSequences)

# motif match-score function
getMatchScore <- function(seq, PSSM) {
  # Score a sequence window against a position-specific scoring matrix.
  #
  # seq  - a single character string (e.g. a 13-residue sequence window)
  # PSSM - numeric matrix; rows are residues (named), columns are positions
  #
  # Returns the sum over positions of PSSM[residue, position]; the gap
  # character "_" contributes 0.  Higher scores indicate a closer motif
  # match.  An unknown residue (no matching rowname) is an error, as before.
  residues <- strsplit(seq, split = "")[[1]]
  score <- 0
  # seq_along() is safe for the empty string (the old 1:nchar(seq) was not),
  # and accumulating into a scalar avoids growing a vector element-by-element
  for (i in seq_along(residues)) {
    if (residues[i] != "_") {
      score <- score + PSSM[residues[i], i]
    }
  }
  score
}

getPercentageMatch <- function (score, max) {
  # Express a raw motif score as a fraction of the maximum score (0-1 scale).
  fraction <- score / max
  fraction
}

# Raw motif scores of every sequence window against each kinase PSSM.
# vapply() is type-safe (guaranteed numeric output) where the original
# as.numeric(lapply(...)) silently relied on every element being a scalar;
# as.character() guards against Seq.Window being stored as a factor.
dt.Insulin$AktMotif <- vapply(as.character(dt.Insulin$Seq.Window),
                              getMatchScore, numeric(1),
                              PSSM = aktSequences.pfm, USE.NAMES = FALSE)
dt.Insulin$mTORMotif <- vapply(as.character(dt.Insulin$Seq.Window),
                               getMatchScore, numeric(1),
                               PSSM = mTORSequences.pfm, USE.NAMES = FALSE)

maxMotifAkt <- max(dt.Insulin$AktMotif)
minMotifAkt <- min(dt.Insulin$AktMotif)
maxMotifmTOR <- max(dt.Insulin$mTORMotif)
minMotifmTOR <- min(dt.Insulin$mTORMotif)

# Normalised (0-1) motif match scores.  getPercentageMatch() is a simple
# vectorised division, so the element-wise lapply() was unnecessary.
dt.Insulin$aktMotifMatch <- getPercentageMatch(dt.Insulin$AktMotif, max = maxMotifAkt)
dt.Insulin$mTORMotifMatch <- getPercentageMatch(dt.Insulin$mTORMotif, max = maxMotifmTOR)

summary of the new features motif

#View(dt.unlabeled)
# inspect the data set with the four new motif feature columns appended
head(dt.Insulin)
##            Identifier    Seq.Window   Avg.Fold       AUC       X15s
## 1       100043914;88; GRHERRSSAEQHS  1.0315122 0.2967288 -0.1858515
## 2       100043914;89; RHERRSSAEQHSE  1.2160532 0.5279128  0.1077492
## 3   1110018J18RIK;18; CAGRVPSPAGSVT  0.2860675 0.5240430  0.7031150
## 4  1110037F02RIK;133; SVDRVISHDRDSP -0.4100882 0.6480379 -0.3962030
## 5  1110037F02RIK;138; ISHDRDSPPPPPP -0.3664219 0.5002719 -0.3962030
## 6 1110037F02RIK;1628; FLSEPSSPGRSKT -0.2148644 0.4370191 -0.2185709
##         X30s         X1m        X2m        X5m         X10m       X20m
## 1  0.5813547  1.88857193  2.1018999  1.8426771  2.287873789  1.3431367
## 2  0.2802415  1.24719923  2.0271668  2.6376657  2.374491409  1.1032924
## 3 -0.0905068  0.04348222  0.3846277  0.4637556  0.425594237  0.3769611
## 4 -0.4970602 -0.37816293 -0.6080681 -0.4343925 -0.445322740 -0.2649039
## 5 -0.2954442 -0.11986335 -0.2672015 -0.3476758 -0.283676544 -0.6626509
## 6 -0.2469075 -0.01866835 -0.3877707 -0.2848602  0.006343619 -0.1311892
##          X60m      Ins.1          LY      Ins.2          MK Kinases
## 1 -1.60756467  0.6526217  0.34193177  2.1570012  2.13792056    <NA>
## 2 -0.04938089  0.6112755  0.04620434  3.0963462  3.22683218    <NA>
## 3 -0.01848881 -0.1988757 -0.14777344 -0.2610874 -0.02748394    <NA>
## 4 -0.25659219  0.1735112 -0.57074291  0.3012657 -0.13305247    <NA>
## 5 -0.55865953  0.2292187 -0.38138545  0.4130244 -0.13305247    <NA>
## 6 -0.43729207  0.2987756 -0.45838604  0.1086583  0.02536763    <NA>
##   AktMotif mTORMotif aktMotifMatch mTORMotifMatch
## 1 2.636364  1.192308     0.5321101      0.3522727
## 2 2.545455  1.576923     0.5137615      0.4659091
## 3 2.590909  2.384615     0.5229358      0.7045455
## 4 2.272727  1.615385     0.4587156      0.4772727
## 5 1.363636  2.346154     0.2752294      0.6931818
## 6 2.045455  2.769231     0.4128440      0.8181818
# sanity checks: label levels and the raw motif-score ranges
unique(dt.Insulin$Kinases)
## [1] NA     "mTOR" "Akt"
max(dt.Insulin$AktMotif)
## [1] 4.954545
min(dt.Insulin$AktMotif)
## [1] 0.04545455
max(dt.Insulin$mTORMotif)
## [1] 3.384615
min(dt.Insulin$mTORMotif)
## [1] 0.1153846
# refresh the labelled subsets now that the motif features exist
dt.labeled.mTOR <-  dt.Insulin[which(dt.Insulin$Kinases == 'mTOR'), ]
dt.labeled.Akt <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt'), ]

feature extraction

# this part references Peng Yi's code - KSP-PUEL GUI feature extraction
# inspect column types before extracting the secondary (derived) features
str(dt.Insulin)
## 'data.frame':    12062 obs. of  21 variables:
##  $ Identifier    : Factor w/ 12062 levels "100043914;88;",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Seq.Window    : chr  "GRHERRSSAEQHS" "RHERRSSAEQHSE" "CAGRVPSPAGSVT" "SVDRVISHDRDSP" ...
##  $ Avg.Fold      : num  1.032 1.216 0.286 -0.41 -0.366 ...
##  $ AUC           : num  0.297 0.528 0.524 0.648 0.5 ...
##  $ X15s          : num  -0.186 0.108 0.703 -0.396 -0.396 ...
##  $ X30s          : num  0.5814 0.2802 -0.0905 -0.4971 -0.2954 ...
##  $ X1m           : num  1.8886 1.2472 0.0435 -0.3782 -0.1199 ...
##  $ X2m           : num  2.102 2.027 0.385 -0.608 -0.267 ...
##  $ X5m           : num  1.843 2.638 0.464 -0.434 -0.348 ...
##  $ X10m          : num  2.288 2.374 0.426 -0.445 -0.284 ...
##  $ X20m          : num  1.343 1.103 0.377 -0.265 -0.663 ...
##  $ X60m          : num  -1.6076 -0.0494 -0.0185 -0.2566 -0.5587 ...
##  $ Ins.1         : num  0.653 0.611 -0.199 0.174 0.229 ...
##  $ LY            : num  0.3419 0.0462 -0.1478 -0.5707 -0.3814 ...
##  $ Ins.2         : num  2.157 3.096 -0.261 0.301 0.413 ...
##  $ MK            : num  2.1379 3.2268 -0.0275 -0.1331 -0.1331 ...
##  $ Kinases       : chr  NA NA NA NA ...
##  $ AktMotif      : num  2.64 2.55 2.59 2.27 1.36 ...
##  $ mTORMotif     : num  1.19 1.58 2.38 1.62 2.35 ...
##  $ aktMotifMatch : num  0.532 0.514 0.523 0.459 0.275 ...
##  $ mTORMotifMatch: num  0.352 0.466 0.705 0.477 0.693 ...
# primary temporal features (identifier + the eight measured time points);
# X0s adds a zero baseline at time 0 so each profile is anchored at the origin
dt.Insulin.feat <- dt.Insulin[, c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m")]
dt.Insulin.feat$X0s <- 0 
head(dt.Insulin.feat)
##            Identifier       X15s       X30s         X1m        X2m
## 1       100043914;88; -0.1858515  0.5813547  1.88857193  2.1018999
## 2       100043914;89;  0.1077492  0.2802415  1.24719923  2.0271668
## 3   1110018J18RIK;18;  0.7031150 -0.0905068  0.04348222  0.3846277
## 4  1110037F02RIK;133; -0.3962030 -0.4970602 -0.37816293 -0.6080681
## 5  1110037F02RIK;138; -0.3962030 -0.2954442 -0.11986335 -0.2672015
## 6 1110037F02RIK;1628; -0.2185709 -0.2469075 -0.01866835 -0.3877707
##          X5m         X10m       X20m        X60m X0s
## 1  1.8426771  2.287873789  1.3431367 -1.60756467   0
## 2  2.6376657  2.374491409  1.1032924 -0.04938089   0
## 3  0.4637556  0.425594237  0.3769611 -0.01848881   0
## 4 -0.4343925 -0.445322740 -0.2649039 -0.25659219   0
## 5 -0.3476758 -0.283676544 -0.6626509 -0.55865953   0
## 6 -0.2848602  0.006343619 -0.1311892 -0.43729207   0
#### secondary feature 1
# magnitude calculation: arithmetic mean of the eight time points
# (equivalent to rowMeans(); the printed values match the existing Avg.Fold
# column, which is why this feature is not merged back in later)
average.score <- rowSums(dt.Insulin.feat[, 2:9]) / ncol(dt.Insulin.feat[, 2:9])
names(average.score) <- rownames(dt.Insulin.feat)
head(average.score)
##          1          2          3          4          5          6 
##  1.0315122  1.2160532  0.2860675 -0.4100882 -0.3664219 -0.2148644
#### secondary feature 2
# Temporal-profile fitting: the F-statistic of a quadratic fit through the
# origin to each site's time course measures how "trend-like" the profile is.
# NOTE(review): the original indexed dt.Insulin[i, 2:10], which at this point
# are Seq.Window..X10m (mixed types), not the assembled time-course columns;
# dt.Insulin.feat cols 2:10 (X15s..X0s) are clearly what was intended, so
# the fit now uses those.
x <- 2:10        # pseudo time index, identical for every row: hoisted out of the loop
x2 <- x^2
fitting.score <- numeric(nrow(dt.Insulin.feat))  # preallocate instead of growing with c()
for (i in seq_len(nrow(dt.Insulin.feat))) {
  y <- as.numeric(dt.Insulin.feat[i, 2:10])
  lmfit <- lm(formula = y ~ x + x2 - 1)          # quadratic fit, no intercept
  fitting.score[i] <- summary(lmfit)$fstatistic[1]
}
fitted.score <- log2(fitting.score)
names(fitted.score) <- rownames(dt.Insulin.feat)

# combine extracted secondary features with primary features
# (average.score is omitted because it duplicates the existing Avg.Fold column)
dt.Insulin.feat <- cbind(fitted.score, dt.Insulin.feat) # removed avg score since same as avg fold
head(dt.Insulin.feat)
##   fitted.score          Identifier       X15s       X30s         X1m
## 1    4.4639660       100043914;88; -0.1858515  0.5813547  1.88857193
## 2    4.6421890       100043914;89;  0.1077492  0.2802415  1.24719923
## 3    2.3391666   1110018J18RIK;18;  0.7031150 -0.0905068  0.04348222
## 4    1.7749557  1110037F02RIK;133; -0.3962030 -0.4970602 -0.37816293
## 5    1.0122698  1110037F02RIK;138; -0.3962030 -0.2954442 -0.11986335
## 6   -0.1060349 1110037F02RIK;1628; -0.2185709 -0.2469075 -0.01866835
##          X2m        X5m         X10m       X20m        X60m X0s
## 1  2.1018999  1.8426771  2.287873789  1.3431367 -1.60756467   0
## 2  2.0271668  2.6376657  2.374491409  1.1032924 -0.04938089   0
## 3  0.3846277  0.4637556  0.425594237  0.3769611 -0.01848881   0
## 4 -0.6080681 -0.4343925 -0.445322740 -0.2649039 -0.25659219   0
## 5 -0.2672015 -0.3476758 -0.283676544 -0.6626509 -0.55865953   0
## 6 -0.3877707 -0.2848602  0.006343619 -0.1311892 -0.43729207   0
# join the secondary features back onto the full data set on the identifier
# plus time-point columns, then drop the synthetic X0s baseline.
# NOTE(review): merge() re-sorts rows by the key columns and moves the keys
# to the front; downstream chunks index by column name so this looks safe,
# but confirm nothing relies on the original row or column order.
dt.Insulin <- merge(dt.Insulin, dt.Insulin.feat, by=c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m"))
dt.Insulin$X0s <- NULL
head(dt.Insulin)
##            Identifier       X15s       X30s         X1m        X2m
## 1       100043914;88; -0.1858515  0.5813547  1.88857193  2.1018999
## 2       100043914;89;  0.1077492  0.2802415  1.24719923  2.0271668
## 3   1110018J18RIK;18;  0.7031150 -0.0905068  0.04348222  0.3846277
## 4  1110037F02RIK;133; -0.3962030 -0.4970602 -0.37816293 -0.6080681
## 5  1110037F02RIK;138; -0.3962030 -0.2954442 -0.11986335 -0.2672015
## 6 1110037F02RIK;1628; -0.2185709 -0.2469075 -0.01866835 -0.3877707
##          X5m         X10m       X20m        X60m    Seq.Window   Avg.Fold
## 1  1.8426771  2.287873789  1.3431367 -1.60756467 GRHERRSSAEQHS  1.0315122
## 2  2.6376657  2.374491409  1.1032924 -0.04938089 RHERRSSAEQHSE  1.2160532
## 3  0.4637556  0.425594237  0.3769611 -0.01848881 CAGRVPSPAGSVT  0.2860675
## 4 -0.4343925 -0.445322740 -0.2649039 -0.25659219 SVDRVISHDRDSP -0.4100882
## 5 -0.3476758 -0.283676544 -0.6626509 -0.55865953 ISHDRDSPPPPPP -0.3664219
## 6 -0.2848602  0.006343619 -0.1311892 -0.43729207 FLSEPSSPGRSKT -0.2148644
##         AUC      Ins.1          LY      Ins.2          MK Kinases AktMotif
## 1 0.2967288  0.6526217  0.34193177  2.1570012  2.13792056    <NA> 2.636364
## 2 0.5279128  0.6112755  0.04620434  3.0963462  3.22683218    <NA> 2.545455
## 3 0.5240430 -0.1988757 -0.14777344 -0.2610874 -0.02748394    <NA> 2.590909
## 4 0.6480379  0.1735112 -0.57074291  0.3012657 -0.13305247    <NA> 2.272727
## 5 0.5002719  0.2292187 -0.38138545  0.4130244 -0.13305247    <NA> 1.363636
## 6 0.4370191  0.2987756 -0.45838604  0.1086583  0.02536763    <NA> 2.045455
##   mTORMotif aktMotifMatch mTORMotifMatch fitted.score
## 1  1.192308     0.5321101      0.3522727    4.4639660
## 2  1.576923     0.5137615      0.4659091    4.6421890
## 3  2.384615     0.5229358      0.7045455    2.3391666
## 4  1.615385     0.4587156      0.4772727    1.7749557
## 5  2.346154     0.2752294      0.6931818    1.0122698
## 6  2.769231     0.4128440      0.8181818   -0.1060349
# average fold and average score are the same. already provided

model analysis and select the best fit model

setup

# 16 predictor features + the class label column (17 columns total)
predictorsAkt <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","aktMotifMatch","Kinases")

predictorsmTOR <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","mTORMotifMatch","Kinases")

# split into labelled positives per kinase and the unlabelled pool
dt.labeled.mTOR <-  dt.Insulin[which(dt.Insulin$Kinases == 'mTOR'), ]
dt.labeled.Akt <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt'), ]
dt.unlabelled <- dt.Insulin[which(is.na(dt.Insulin$Kinases)), ]

# PU-learning style class coding: positives become +1 here; unlabelled rows
# sampled below act as pseudo-negatives with class -1
dt.feature.selection.akt <- dt.labeled.Akt[,predictorsAkt]
dt.feature.selection.akt$Kinases <- 1
dt.feature.selection.mTOR <- dt.labeled.mTOR[,predictorsmTOR]
dt.feature.selection.mTOR$Kinases <- 1

dim(dt.feature.selection.akt)
## [1] 22 17
dim(dt.feature.selection.mTOR)
## [1] 26 17
# pick 22 random unlabelled rows as Akt pseudo-negatives (class -1)
set.seed(55)
aktNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.akt))
temp <- dt.unlabelled[aktNeg, predictorsAkt]
temp$Kinases <- -1
dt.feature.selection.akt <- rbind(dt.feature.selection.akt, temp)
temp <- NULL

# pick 26 random unlabelled rows as mTOR pseudo-negatives (class -1)
set.seed(65)
mtorNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.mTOR))
temp <- dt.unlabelled[mtorNeg, predictorsmTOR]
temp$Kinases <- -1
dt.feature.selection.mTOR <- rbind(dt.feature.selection.mTOR, temp)
temp <- NULL

# final sets for the feature-selection exercise: the label becomes a factor
# and the hard-coded 17th column (the label) is renamed to "Class"
dt.feature.selection.akt$Kinases <- as.factor(dt.feature.selection.akt$Kinases)
dt.feature.selection.mTOR$Kinases <- as.factor(dt.feature.selection.mTOR$Kinases)
colnames(dt.feature.selection.akt)[17] <- "Class"
colnames(dt.feature.selection.mTOR)[17] <- "Class"

dim(dt.feature.selection.akt)
## [1] 44 17
dim(dt.feature.selection.mTOR)
## [1] 52 17
# function for model evaluation 

# Build a balanced two-class set: the supplied positives plus an equal
# number of rows sampled at random (without replacement) from the global
# dt.unlabelled pool, labelled -1 as pseudo-negatives.
#
# positives    - data frame of positive examples; column 17 holds the class
# featureSpace - optional character vector of columns to take from
#                dt.unlabelled; defaults to colnames(positives).  (The old
#                version accepted this argument but always overwrote it.)
#
# Returns positives stacked on pseudo-negatives; class column named "Class".
sampleData <- function (positives, featureSpace = NULL) {
  colnames(positives)[17] <- "Kinases"  # temporary name so rbind() columns line up
  if (is.null(featureSpace)) {
    featureSpace <- colnames(positives)
  }
  #set.seed(33)
  randomsamples <- sample(nrow(dt.unlabelled), size = nrow(positives), replace = FALSE)
  negatives <- dt.unlabelled[randomsamples, featureSpace]
  negatives$Kinases <- -1
  # make a balanced set
  balanced <- rbind(positives, negatives)
  colnames(balanced)[17] <- "Class"  # restore the name callers expect
  return(balanced)
}

# Train a caret model of the given `method` on `data` (LOOCV for tuning),
# then score it with a ROC curve on a re-balanced set that keeps the same
# positives but draws fresh random pseudo-negatives via sampleData().
#
# method - caret method string ("kknn", "svmRadial", "svmLinear", "svmPoly",
#          "lda", ...); any other method falls through to caret's default grid
# data   - balanced data frame whose 17th column is the factor "Class"
#
# Returns a pROC::roc object for the model's predictions on the resampled set.
AnalyseModels <- function (method, data) {
  # prepare training scheme utilising LOOCV
  control <- trainControl(method="repeatedcv", number=10, repeats=3)  # NOTE(review): defined but never used
  controlLOOCV <- trainControl(method="LOOCV")
  
  model <- NULL
  
  #set.seed(5)
  # method-specific tuning grids; the last (default) branch lets caret pick
  switch(method, 
  kknn={
    model <- train(Class~., data=data, method=method, trControl=controlLOOCV)
  },
  svmRadial={
    tuneGrid <- expand.grid(sigma=seq(0.1,1,0.1),C=seq(0.1,5,0.5))
    model <- train(Class~., data=data, method=method, trControl=controlLOOCV,tuneGrid=tuneGrid)
  },
  svmLinear={
    tuneGrid <- expand.grid(C=seq(0.1,5,0.5))
    model <- train(Class~., data=data, method=method, trControl=controlLOOCV, tuneGrid=tuneGrid)
  },
  svmPoly={
    tuneGrid <- expand.grid(degree=(1:5),scale=seq(0.1,1,0.1),C=seq(0.1,5,0.5))
    model <- train(Class~., data=data, method=method, trControl=controlLOOCV, tuneGrid=tuneGrid)
  },
  {
    model <- train(Class~., data=data, method=method, trControl=controlLOOCV)
  }
  )
  #model <- train(Class~., data=data, method=method, trControl=controlLOOCV)
  print(model$bestTune)
  # resample data for testing and keep the same positives
  # NOTE(review): the "test" set still contains the training positives, so
  # the ROC estimate is optimistic; only the pseudo-negatives are fresh.
  train <- sampleData(positives=data[which(data$Class == 1),])
  
  # predict on the resampled set; model$finalModel bypasses caret's
  # preprocessing layer -- presumably fine here since none was requested,
  # but confirm if preProcess is ever added to train()
  pred <- predict(model$finalModel, train[,-17])
  if (method=="lda") {
    # MASS::lda predictions come back as a list; class labels are in $class
    rocModel <- roc(train[, 17], as.numeric(pred$class))  
  }
  else {
    rocModel <- roc(train[, 17], as.numeric(pred))  
  }
  
  return(rocModel)
}

run through some models to pick best one for AKT

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:IRanges':
## 
##     cov, var
## The following objects are masked from 'package:S4Vectors':
## 
##     cov, var
## The following object is masked from 'package:BiocGenerics':
## 
##     var
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
#model <- train(Class~., data=dt.feature.selection.akt, method="svmRadial", trControl=controlLOOCV)

# analyze each model on the Akt balanced set; each call prints the best
# tuning parameters.  NOTE(review): sampleData() draws fresh random
# pseudo-negatives inside AnalyseModels, so the ROC curves vary between runs.
rocKnn <- AnalyseModels("kknn",dt.feature.selection.akt) 
##   kmax distance  kernel
## 3    9        2 optimal
rocSvmR <- AnalyseModels("svmRadial",dt.feature.selection.akt)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:Biostrings':
## 
##     type
## The following object is masked from 'package:scales':
## 
##     alpha
## The following object is masked from 'package:ggplot2':
## 
##     alpha
##    sigma   C
## 93     1 1.1
rocSvmL <- AnalyseModels("svmLinear",dt.feature.selection.akt)
##     C
## 2 0.6
rocSvmP <- AnalyseModels("svmPoly",dt.feature.selection.akt)
##    degree scale   C
## 22      1   0.3 0.6
rocLda <- AnalyseModels("lda",dt.feature.selection.akt)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
##   parameter
## 1      none
#modelGbm <- AnalyseModels("gbm",dt.feature.selection.akt)

# overlay the ROC curves of all candidate models on one plot
plot(rocKnn, legacy.axes = TRUE, col="red", lty=3, main="Akt model performances")
lines(rocSvmR, col="blue", lty=2)
lines(rocSvmL, col="green3", lty=1)
lines(rocSvmP, col="orange", lty=2)
lines(rocLda, col="purple", lty=3)

legend("bottomright", inset = .05, legend=c("KNN", "SVM Radial", "SVM Linear", "SVM Poly", "LDA"), col = c("red", "blue", "green3", "orange","purple"), lty=c(3, 2, 1, 2, 3))

run through some models to pick best one for mTOR

# analyze each model on the mTOR balanced set; same caveat as for Akt --
# fresh random pseudo-negatives are drawn inside AnalyseModels each call
rocKnn <- AnalyseModels("kknn",dt.feature.selection.mTOR)
##   kmax distance  kernel
## 3    9        2 optimal
rocSvmR <- AnalyseModels("svmRadial",dt.feature.selection.mTOR)
##    sigma   C
## 93     1 1.1
rocSvmL <- AnalyseModels("svmLinear",dt.feature.selection.mTOR)
##     C
## 1 0.1
rocSvmP <- AnalyseModels("svmPoly",dt.feature.selection.mTOR)
##    degree scale   C
## 12      1   0.2 0.6
rocLda <- AnalyseModels("lda",dt.feature.selection.mTOR)
##   parameter
## 1      none
#modelGbm <- AnalyseModels("gbm",dt.feature.selection.akt)


# overlay the ROC curves of all candidate models on one plot
plot(rocKnn, legacy.axes = TRUE, col="red", lty=3, main="mTOR model performances")
lines(rocSvmR, col="blue", lty=2)
lines(rocSvmL, col="green3", lty=1)
lines(rocSvmP, col="orange", lty=2)
lines(rocLda, col="purple", lty=3)

legend("bottomright", inset = .05, legend=c("KNN", "SVM Radial", "SVM Linear", "SVM Poly", "LDA"), col = c("red", "blue", "green3", "orange","purple"), lty=c(3, 2, 1, 2, 3))

Model analysis results

# Summary of the model comparison: SVM Radial is selected for both kinases.
# (Fixed the message grammar: "yielded a the best" -> "yielded the best".)
print("SVM Radial yielded the best overall result although the ROC curve indicates otherwise on the small subset")
## [1] "SVM Radial yielded the best overall result although the ROC curve indicates otherwise on the small subset"
print("Best Model tuning parameters - AKT")
## [1] "Best Model tuning parameters - AKT"
print("SVM Radial - Sigma: 1, C: 1.1")
## [1] "SVM Radial - Sigma: 1, C: 1.1"
print("Best Model tuning parameters - mTOR")
## [1] "Best Model tuning parameters - mTOR"
print("SVM Radial - Sigma: 1, C: 1.1")
## [1] "SVM Radial - Sigma: 1, C: 1.1"

feature selection

setup

library(mlbench)
library(caret)

# 16 feature columns plus the Kinases label column (17 columns total).
predictorsAkt <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","aktMotifMatch","Kinases")

predictorsmTOR <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","mTORMotifMatch","Kinases")

# Split dt.Insulin into labeled positives (per kinase) and unlabeled rows.
dt.labeled.mTOR <-  dt.Insulin[which(dt.Insulin$Kinases == 'mTOR'), ]
dt.labeled.Akt <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt'), ]
dt.unlabelled <- dt.Insulin[which(is.na(dt.Insulin$Kinases)), ]

# Positives get class label +1 (the Kinases text column is overwritten).
dt.feature.selection.akt <- dt.labeled.Akt[,predictorsAkt]
dt.feature.selection.akt$Kinases <- 1
dt.feature.selection.mTOR <- dt.labeled.mTOR[,predictorsmTOR]
dt.feature.selection.mTOR$Kinases <- 1

dim(dt.feature.selection.akt)
## [1] 22 17
dim(dt.feature.selection.mTOR)
## [1] 26 17
# pick 22 random for akt negatives
# NOTE(review): no set.seed() before sample(), so this draw is not reproducible.
aktNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.akt))
temp <- dt.unlabelled[aktNeg, predictorsAkt]
temp$Kinases <- -1
dt.feature.selection.akt <- rbind(dt.feature.selection.akt, temp)
temp <- NULL

# pick 26 random mtor negatives (same caveat: unseeded draw)
mtorNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.mTOR))
temp <- dt.unlabelled[mtorNeg, predictorsmTOR]
temp$Kinases <- -1
dt.feature.selection.mTOR <- rbind(dt.feature.selection.mTOR, temp)
temp <- NULL

# final balanced sets (positives + equal number of random negatives)
dim(dt.feature.selection.akt)
## [1] 44 17
dim(dt.feature.selection.mTOR)
## [1] 52 17

functions for feature selection

tstatSort <- function (train, class.col = "Kinases") {
  # Rank features by two-sample t-test p-value (most discriminative first).
  #
  # train:     data.frame of numeric feature columns plus a class column
  #            holding exactly two class labels.
  # class.col: name of the class column (default "Kinases"); this replaces
  #            the previous hard-coded column index 17, so the function now
  #            works for any column order / feature count.
  #
  # Returns a character vector of feature names sorted by ascending p-value.

  cls.idx <- which(colnames(train) == class.col)
  if (length(cls.idx) != 1) {
    stop("class column '", class.col, "' not found exactly once", call. = FALSE)
  }

  # split the feature columns into the two class groups
  train.byClass <- split(train[, -cls.idx], train[[class.col]])

  feature.names <- colnames(train)[-cls.idx]

  # Welch two-sample t-test p-value for every feature (group 1 vs group 2);
  # vapply replaces the original c()-growth loop and guarantees a numeric result
  feature.pvalues <- vapply(
    feature.names,
    function(f) t.test(train.byClass[[1]][[f]], train.byClass[[2]][[f]])$p.value,
    numeric(1)
  )

  # smallest p-value (most discriminative) first
  names(sort(feature.pvalues))
}

backwardStepwise <- function(train, cls.train, sortedFeatures) {
  # Backward stepwise feature elimination benchmarked with a LOOCV SVM.
  #
  # train:          data.frame of feature columns (class column removed).
  # cls.train:      class labels aligned with the rows of `train`.
  # sortedFeatures: feature names ordered most- to least-discriminative
  #                 (e.g. the output of tstatSort); the least useful feature
  #                 is dropped from the tail one at a time.
  #
  # Returns the mean LOOCV test accuracy for each feature-set size, ordered
  # from all features down to a single feature.
  #
  # NOTE: relies on caret::createFolds and e1071::svm being attached.
  # Fixes vs the original: the inner loop no longer shadows the outer loop
  # variable `i`, the unused `current.f` is gone, and both accuracy vectors
  # are preallocated instead of grown with c().

  n.sizes <- length(sortedFeatures)
  test.acc <- numeric(n.sizes)

  # one fold per row == leave-one-out CV; fixed seed for reproducible folds
  set.seed(99)
  fold <- createFolds(cls.train, k=nrow(train))

  for (size in seq(n.sizes, 1)) {
    # keep the `size` most discriminative features
    features <- sortedFeatures[seq_len(size)]

    test.accuracies <- numeric(length(fold))

    for (k in seq_along(fold)) {
      truth <- cls.train[fold[[k]]]
      trainingData <- train[-fold[[k]], features]
      testingData <- train[fold[[k]], features]

      svm.model <- svm(x=trainingData, y=cls.train[-fold[[k]]], kernel="radial", type="C-classification")
      pred <- predict(svm.model, testingData)

      test.accuracies[k] <- sum(pred == truth) / length(truth)
    }

    # average held-out accuracy for this feature-set size; first entry is
    # the full feature set, matching the original append order
    test.acc[n.sizes - size + 1] <- mean(test.accuracies)
  }

  return(test.acc)
}

# run the random sampling XX times to see if the result is different
randomAccuracies <- function(positives, featureSpace, sortedFeatures, iterations) {
  accuracies.all <- data.frame()
  #names(accuracies.all) <- seq(length(sortedFeatures), 1)
  for (i in 1:iterations) {
    # pick 22 random for negatives
    set.seed(i)
    randomsamples <- sample(nrow(dt.unlabelled), size=nrow(positives), replace = FALSE)
    #print(randomsamples)
    negatives <- dt.unlabelled[randomsamples, featureSpace]
    negatives$Kinases <- -1
    # make a balanced set
    balanced <- rbind(positives, negatives)
    # run the wrapper selection process
    accuracies <- backwardStepwise(train = balanced[,-17],
                                        cls.train = balanced[,17],
                                        sortedFeatures = sortedFeatures)
    #accuracies <- data.frame(accuracies)
    #names(accuracies) <- seq(length(sortedFeatures), 1)
    accuracies.all <- rbind(accuracies.all, accuracies)
  }
  names(accuracies.all) <- seq(length(sortedFeatures), 1)
  return(accuracies.all)
}

run the wrapper feature selection (SVM)

# Rank the Akt features by t-test p-value (most discriminative first).
featuresAkt <- tstatSort(dt.feature.selection.akt)
# sorted features 
featuresAkt
##  [1] "aktMotifMatch" "Avg.Fold"      "X2m"           "X10m"         
##  [5] "X5m"           "X1m"           "X30s"          "X60m"         
##  [9] "X20m"          "AUC"           "Ins.1"         "fitted.score" 
## [13] "Ins.2"         "LY"            "X15s"          "MK"
# best (smallest p-value) feature as per t-test
featuresAkt[1]
## [1] "aktMotifMatch"
# Rank the mTOR features the same way.
featuresmTOR <- tstatSort(dt.feature.selection.mTOR)
# sorted features 
featuresmTOR
##  [1] "mTORMotifMatch" "X20m"           "Ins.2"          "X10m"          
##  [5] "X60m"           "Ins.1"          "Avg.Fold"       "X5m"           
##  [9] "MK"             "X2m"            "LY"             "AUC"           
## [13] "fitted.score"   "X1m"            "X15s"           "X30s"
# best (smallest p-value) feature as per t-test
featuresmTOR[1]
## [1] "mTORMotifMatch"

backward stepwise

akt feature selection with random sampling, LOOCV

# Reset the positive sets (drop the random negatives added earlier) so
# randomAccuracies() can pair them with its own per-iteration negatives.
dt.feature.selection.akt <- dt.labeled.Akt[,predictorsAkt]
dt.feature.selection.akt$Kinases <- 1
dt.feature.selection.mTOR <- dt.labeled.mTOR[,predictorsmTOR]
dt.feature.selection.mTOR$Kinases <- 1

# 100 random negative draws; LOOCV accuracy per feature-set size.
test.accuracies.akt <- randomAccuracies(positives = dt.feature.selection.akt, 
                                        featureSpace = predictorsAkt,
                                        sortedFeatures = featuresAkt,
                                        iterations = 100)

mTOR feature selection with random sampling

# Same 100-draw backward stepwise exercise for mTOR.
test.accuracies.mtor <- randomAccuracies(positives = dt.feature.selection.mTOR, 
                                        featureSpace = predictorsmTOR,
                                        sortedFeatures = featuresmTOR,
                                        iterations = 100)

plot the results for backwards selection

# Average the LOOCV accuracies over the 100 random draws
# (one mean per feature-set size).
test.accuracies.akt.avg <- colMeans(test.accuracies.akt)
test.accuracies.mtor.avg <- colMeans(test.accuracies.mtor)

akt plots

# rev() so the x-axis runs from 1 feature up to all 16 features.
plot(rev(test.accuracies.akt.avg),type="l",ylim=c(0.8, 1), xaxt="n", ylab="Test accuracies", xlab="Number of features sorted",main="Feature selection vs model accuracy akt")
axis(1, at=seq(1, 16, by = 1), las=2)

mtor plots

# rev() so the x-axis runs from 1 feature up to all 16 features.
plot(rev(test.accuracies.mtor.avg),type="l",ylim=c(0.5, 1), xaxt="n", ylab="Test accuracies", xlab="Number of features sorted",main="Feature selection vs model accuracy mtor")
axis(1, at=seq(1, 16, by = 1), las=2)

## feature selection results

All of the features in both the Akt and mTOR classifications yield impressive results. As such, all features are important and none can be dropped.

Let there be light - Learn more about the unlabelled dataset

The purpose here is to identify highly probable negatives for mTORs and Akts respectively so that an ensemble can be built more effectively.

Successful identification of the negative samples will assist in removing bias and also solve the class imbalance problem.

helper functions/methods

library(MASS)

# helpers for ensemble/bagging

# function to build model 
# Bagged SVM over positives vs. randomly sampled unlabeled rows.
#
# dt.model.full: full feature table; column 1 = Identifier, the remaining
#                columns are the features fed to the SVM.
# dt.labeled:    labeled positives; column V1 holds their Identifiers.
# cost.n:        SVM cost parameter.
# degree.n:      SVM degree parameter (only relevant for polynomial kernels;
#                passed anyway — harmless for the radial kernel used here).
# iterations:    number of bagging rounds, each with a fresh random draw of
#                unlabeled rows as pseudo-negatives.
# dt.pred:       feature matrix scored in every round.
#
# Returns a data.frame (Identifier, Class, predictResult) where predictResult
# is the mean probability (column 1 of the SVM probability matrix) over all
# rounds. Side effect: plots the round-to-round probability correlations as a
# convergence diagnostic.
# NOTE(review): no set.seed() inside, so results vary between runs.
model.func <- function (dt.model.full, dt.labeled, cost.n, degree.n, iterations, dt.pred) {

  # probabilities from the previous round; starts at all zeros, so the first
  # cor() call divides by a zero sd — this produces the "standard deviation
  # is zero" warning seen in the knitted output
  lastProbabilities <- rep(0, nrow(dt.model.full))
  iterationCors <- c()
  
  # label rows: +1 if the Identifier is in the labeled set, else -1
  dt.model.full %>%
    mutate(Class = -1) %>%
    mutate(Class = replace(Class, Identifier %in% dt.labeled$V1, 1)) -> dt.model.full
  
  # data frame to store the per-round predictions
  dt.result <- dt.model.full[, c("Identifier", "Class")]
  
  for (i in 1:iterations) {
    
    # draw as many random unlabeled rows as there are positives (balanced set)
    dt.nolabel.sample <- dt.model.full[dt.model.full$Class==-1,][sample(nrow(dt.model.full[dt.model.full$Class==-1, ]), nrow(dt.labeled)),]
    
    dt.model <- rbind(dt.model.full[dt.model.full$Class==1,], dt.nolabel.sample)
    
    # build SVM classification model incl probability (Identifier column dropped)
    fit.model <- svm(Class ~ ., data=dt.model[, 2:ncol(dt.model)], kernel="radial", type="C-classification", cost=cost.n, degree=degree.n, decision.values = TRUE, probability=TRUE)
    
    pred <- predict(fit.model, dt.pred, decision.values = TRUE, probability = TRUE)
    
    # column 1 of the probability matrix — assumed to be the positive class;
    # TODO confirm the column order from the fitted model's class levels
    currentProbabilities <- attr(pred, "probabilities")[,1] # attr(pred, "probabilities")[,1]
    dt.result[, ncol(dt.result) + 1] <- currentProbabilities
    names(dt.result)[ncol(dt.result)] <- paste0("model_", i)
    
    # correlation with the previous round — convergence diagnostic
    currentCor <-  cor(currentProbabilities, lastProbabilities)
    iterationCors <- c(iterationCors, currentCor)  
    lastProbabilities <- currentProbabilities
  
  }
    
  # average the per-round probabilities into the final score
  dt.final <- data.frame(dt.result[,1:2],pred.Means=rowMeans(dt.result[,3:ncol(dt.result)]))
  names(dt.final)[colnames(dt.final)=="pred.Means"] <- "predictResult"
  
  # plot the round-to-round correlations
  plot(iterationCors,type="l",#ylim=c(0, 2),
       ylab="correlation", 
       xlab="Number of iterations",
       main="Optimisation point for random sampling")
  
  return(dt.final)
  
}

# ensemble function (make final predictions)
# Ensemble of 1000 SVMs trained on positives vs. samples from a pre-selected
# high-confidence negative pool (instead of arbitrary unlabeled rows).
#
# dt.model.full: full feature table; column 1 = Identifier.
# dt.labeled:    labeled positives; column V1 holds their Identifiers.
# dt.neg:        pool of likely negatives (from the adaptive sampling step).
# cost.n, degree.n: SVM tuning parameters, as in model.func().
# dt.pred:       feature matrix scored in every round.
#
# Returns a data.frame (Identifier, Class, predictResult) with the mean
# probability over the 1000 rounds.
# NOTE(review): the 1000 rounds are hard-coded and unseeded.
model.adp2.func <- function (dt.model.full, dt.labeled, dt.neg, cost.n, degree.n, dt.pred) {

  # label rows: +1 if the Identifier is in the labeled set, else -1
  dt.model.full %>%
    mutate(Class = -1) %>%
    mutate(Class = replace(Class, Identifier %in% dt.labeled$V1, 1)) -> dt.model.full

  dt.neg$Class = -1
  
  # data frame to store the per-round predictions
  dt.result <- dt.model.full[, c("Identifier", "Class")]

  for (i in 1:1000) {

    # generate data for model, combining positive labeled, and the same size selected from most likely negative from adaptive sampling result
    dt.model <- rbind(dt.model.full[dt.model.full$Class==1,], dt.neg[sample(nrow(dt.neg), nrow(dt.labeled)),])

    # build SVM classification model incl probability (Identifier column dropped)
    fit.model <- svm(Class ~ ., data=dt.model[, 2:ncol(dt.model)], kernel ="radial", type="C-classification", decision.values = TRUE, probability=TRUE, cost = cost.n, degree = degree.n)

    pred <- predict(fit.model, dt.pred, decision.values = TRUE, probability = TRUE)

    # column 1 of the probability matrix — assumed to be the positive class;
    # TODO confirm against the fitted model's class levels
    dt.result[, ncol(dt.result) + 1] <- attr(pred, "probabilities")[,1]
    names(dt.result)[ncol(dt.result)] <- paste0("model_", i)

  }
  # average the per-round probabilities into the final score
  dt.final <- data.frame(dt.result[,1:2],pred.Means=rowMeans(dt.result[,3:ncol(dt.result)]))
  names(dt.final)[colnames(dt.final)=="pred.Means"] <- "predictResult"

  return(dt.final)

}

Function to analyze the difference between the 2016 prediction result and our prediction result

# Compare two prediction tables column-matched by regex `type`.
#
# type:          regex matched against column names in both tables to pick
#                the probability column (e.g. "predictResult").
# dt.original:   baseline predictions (presumably the 2016 results — confirm
#                against where df.Akt / df.mTOR are loaded).
# dt.prediction: our predictions; must share an "Identifier" column.
# title:         prefix for the histogram title.
#
# Prints a bootstrap 90% interval (5th–95th percentile) of the difference in
# mean probability (in percentage points) and returns a ggplot histogram of
# the per-Identifier differences.
diff.func <- function (type, dt.original, dt.prediction, title) {

# bootstrap sample statistic (difference in prediction probability)
# NOTE(review): unseeded, so the interval varies slightly between runs
B=1000  
dt.original.B <- rep(0,B)
dt.prediction.B <- rep(0,B)

for ( i in 1:B ) {
  dt.original.B[i] = mean(sample(dt.original[, grep(type, colnames(dt.original))], size = nrow(dt.original), replace = TRUE), na.rm = TRUE)
  dt.prediction.B[i] = mean(sample(dt.prediction[, grep(type, colnames(dt.prediction))], size = nrow(dt.prediction), replace = TRUE), na.rm = TRUE)
}

# difference of bootstrap means, expressed in percentage points
b.diff<-(dt.original.B-dt.prediction.B)*100

print(paste("95% Confidence Interval of Average(Bootstrap) difference in ", type, round(quantile(b.diff,0.05),2), round(quantile(b.diff,0.95),2)))

# join the two tables on Identifier, keeping only the matched columns
dt.delta <- merge(dt.original[, c(which(colnames(dt.original)=="Identifier"), grep(type, colnames(dt.original)))], dt.prediction[, c(which(colnames(dt.prediction)=="Identifier"), grep(type, colnames(dt.prediction)))], by=("Identifier"))

# per-Identifier difference (original minus prediction), percentage points
dt.delta$delta<-(dt.delta[,2]-dt.delta[,3])*100
#dt.delta<-dt.delta[complete.cases(dt.delta$delta),] 

# last expression: the ggplot object is the return value
ggplot(dat=dt.delta) + geom_histogram(aes(x=delta),bins=50) +ggtitle(paste0(title, " Distribution of prediction difference 2016 minus 2017"))

}

Create highly probable negative subsets

akt

# Full Akt feature table. NOTE(review): this uses "AktMotif" while the earlier
# feature-selection sets used "aktMotifMatch" — presumably different columns
# of dt.Insulin; confirm they are the intended ones.
dt.Akt.fulmodel <- dt.Insulin[, c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "AktMotif", "fitted.score")]
# cost = 1.1 from the SVM Radial tuning above; the warning below comes from
# round 1 of model.func correlating against an all-zero vector.
aktNancies <- model.func(dt.Akt.fulmodel, dt.Akt, 1.1, 1, 1000, dt.Akt.fulmodel[, 2:17]) # as per best tuning param val 
## Warning in cor(currentProbabilities, lastProbabilities): the standard
## deviation is zero

mtor

# Full mTOR feature table (same caveat as Akt: "mTORMotif" vs the earlier
# "mTORMotifMatch" — confirm the intended column).
dt.mTOR.fulmodel <- dt.Insulin[, c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "mTORMotif", "fitted.score")]
# warning: round-1 correlation against the all-zero vector, as for Akt
mTORNancies <- model.func(dt.mTOR.fulmodel, dt.mTOR, 1.1, 1, 1000, dt.mTOR.fulmodel[, 2:17])
## Warning in cor(currentProbabilities, lastProbabilities): the standard
## deviation is zero

dataset for ensemble

# Inspect the bagged Akt scores: labeled positives should score high...
aktNancies[aktNancies$Class==1, ]
##          Identifier Class predictResult
## 646     AKT1S1;318;     1     0.9542897
## 1215     AS160;324;     1     0.9689396
## 1219     AS160;595;     1     0.9495719
## 1226     AS160;649;     1     0.9527689
## 1469       BAD;136;     1     0.9449047
## 1913       BRF1;92;     1     0.9563046
## 2101    CARHSP1;53;     1     0.8288359
## 3760    ECNOS;1176;     1     0.9526236
## 3764      EDC3;161;     1     0.8909417
## 4632     FKHR2;252;     1     0.9731397
## 4705     FOXO1;316;     1     0.9799596
## 5155      GSK3A;21;     1     0.9786124
## 5159       GSK3B;9;     1     0.9718666
## 5751      IRS1;265;     1     0.9759471
## 5774      IRS1;522;     1     0.9497301
## 6873  KIAA1272;715;     1     0.9496068
## 9225    PFKFB2;469;     1     0.9497431
## 9226    PFKFB2;486;     1     0.9739178
## 9930     RANBP3;58;     1     0.9470495
## 11565    TSC2;1466;     1     0.9618744
## 11566     TSC2;939;     1     0.9748210
## 11567     TSC2;981;     1     0.9838358
# ...and the labeled mTOR sites should score low in the Akt model.
aktNancies[aktNancies$Identifier %in% dt.mTOR$V1, ]
##          Identifier Class predictResult
## 504     AHNAK;5536;    -1    0.05942054
## 640     AKT1S1;255;    -1    0.26232923
## 761   ANKRD17;2041;    -1    0.08440179
## 904       APRF;727;    -1    0.07826836
## 3465    DEPDC6;265;    -1    0.04626077
## 3468    DEPDC6;293;    -1    0.24562685
## 3469    DEPDC6;299;    -1    0.04826489
## 3926   EIF4EBP1;36;    -1    0.07435220
## 3931   EIF4EBP1;64;    -1    0.07719166
## 3935   EIF4EBP2;37;    -1    0.17481102
## 3936   EIF4EBP2;46;    -1    0.14706185
## 4716     FRAP;2481;    -1    0.13432078
## 5123     GRB10;503;    -1    0.19877614
## 5780      IRS1;632;    -1    0.19456065
## 7640       MAF1;60;    -1    0.27295991
## 7641       MAF1;68;    -1    0.24959177
## 8839    NUMA1;1982;    -1    0.17141194
## 9006     PATL1;184;    -1    0.25393371
## 9007     PATL1;194;    -1    0.08706322
## 9955    RAPTOR;863;    -1    0.10394440
## 9956    RAPTOR;877;    -1    0.03634466
## 10344  RPS6KB1;427;    -1    0.40373478
## 10346  RPS6KB1;444;    -1    0.28905449
## 10348  RPS6KB1;452;    -1    0.38390202
## 11110 TBC1D10B;693;    -1    0.09100056
## 11684     ULK1;763;    -1    0.11154273
# High-confidence Akt negatives for the ensemble: unlabeled rows whose
# bagged probability fell below 0.5.
Akt.neg <- dt.Akt.fulmodel[which(dt.Akt.fulmodel$Identifier %in% 
                                   aktNancies[aktNancies$predictResult < 0.5, ]$Identifier 
                                 & !dt.Akt.fulmodel$Identifier %in% dt.Akt$V1), ]

# Inspect the bagged mTOR scores: labeled positives should score high...
mTORNancies[mTORNancies$Class==1, ]
##          Identifier Class predictResult
## 504     AHNAK;5536;     1     0.8978276
## 640     AKT1S1;255;     1     0.9278246
## 761   ANKRD17;2041;     1     0.9244890
## 904       APRF;727;     1     0.9184032
## 3465    DEPDC6;265;     1     0.9057565
## 3468    DEPDC6;293;     1     0.9356970
## 3469    DEPDC6;299;     1     0.7058537
## 3926   EIF4EBP1;36;     1     0.7689698
## 3931   EIF4EBP1;64;     1     0.9312038
## 3935   EIF4EBP2;37;     1     0.9096377
## 3936   EIF4EBP2;46;     1     0.8561459
## 4716     FRAP;2481;     1     0.8938383
## 5123     GRB10;503;     1     0.9752241
## 5780      IRS1;632;     1     0.9229491
## 7640       MAF1;60;     1     0.9485292
## 7641       MAF1;68;     1     0.9643523
## 8839    NUMA1;1982;     1     0.9212074
## 9006     PATL1;184;     1     0.9859447
## 9007     PATL1;194;     1     0.9125086
## 9955    RAPTOR;863;     1     0.9252619
## 9956    RAPTOR;877;     1     0.6953893
## 10344  RPS6KB1;427;     1     0.9214529
## 10346  RPS6KB1;444;     1     0.8957045
## 10348  RPS6KB1;452;     1     0.9215228
## 11110 TBC1D10B;693;     1     0.9433266
## 11684     ULK1;763;     1     0.9377368
# ...but labeled Akt sites score around 0.5 in the mTOR model, which
# motivates the looser 0.8 threshold below.
mTORNancies[mTORNancies$Identifier %in% dt.Akt$V1, ]
##          Identifier Class predictResult
## 646     AKT1S1;318;    -1     0.5438617
## 1215     AS160;324;    -1     0.5125234
## 1219     AS160;595;    -1     0.5464693
## 1226     AS160;649;    -1     0.5384547
## 1469       BAD;136;    -1     0.4340877
## 1913       BRF1;92;    -1     0.4834763
## 2101    CARHSP1;53;    -1     0.2614398
## 3760    ECNOS;1176;    -1     0.5425490
## 3764      EDC3;161;    -1     0.3874161
## 4632     FKHR2;252;    -1     0.5385921
## 4705     FOXO1;316;    -1     0.4885224
## 5155      GSK3A;21;    -1     0.4602780
## 5159       GSK3B;9;    -1     0.5051196
## 5751      IRS1;265;    -1     0.5074499
## 5774      IRS1;522;    -1     0.5585685
## 6873  KIAA1272;715;    -1     0.5569316
## 9225    PFKFB2;469;    -1     0.5204383
## 9226    PFKFB2;486;    -1     0.4887327
## 9930     RANBP3;58;    -1     0.4842497
## 11565    TSC2;1466;    -1     0.4905927
## 11566     TSC2;939;    -1     0.4552222
## 11567     TSC2;981;    -1     0.4688654
# negative subset for ensemble step, due to high probability value of labeled Akt in mTOR model, we decided to set up threshold at 0.8 for negative class in mTOR model
mTOR.neg <- dt.mTOR.fulmodel[which(dt.mTOR.fulmodel$Identifier %in%
                                     mTORNancies[mTORNancies$predictResult 
                                                          < 0.8, ]$Identifier & 
                                     !dt.mTOR.fulmodel$Identifier %in% dt.mTOR$V1), ]

Final Predictions

# Final Akt ensemble: positives vs. samples from the curated negative pool.
Akt.adp2 <- model.adp2.func(dt.Akt.fulmodel, dt.Akt, Akt.neg, 1.1, 1, dt.Akt.fulmodel[, 2:17])
# Sanity check: labeled positives still score high...
Akt.adp2[Akt.adp2$Class==1, ]
##          Identifier Class predictResult
## 646     AKT1S1;318;     1     0.9614144
## 1215     AS160;324;     1     0.9746824
## 1219     AS160;595;     1     0.9567079
## 1226     AS160;649;     1     0.9595746
## 1469       BAD;136;     1     0.9547010
## 1913       BRF1;92;     1     0.9648792
## 2101    CARHSP1;53;     1     0.8479057
## 3760    ECNOS;1176;     1     0.9595276
## 3764      EDC3;161;     1     0.9048182
## 4632     FKHR2;252;     1     0.9776503
## 4705     FOXO1;316;     1     0.9841488
## 5155      GSK3A;21;     1     0.9835935
## 5159       GSK3B;9;     1     0.9776482
## 5751      IRS1;265;     1     0.9805084
## 5774      IRS1;522;     1     0.9567083
## 6873  KIAA1272;715;     1     0.9567106
## 9225    PFKFB2;469;     1     0.9567090
## 9226    PFKFB2;486;     1     0.9794170
## 9930     RANBP3;58;     1     0.9569535
## 11565    TSC2;1466;     1     0.9704809
## 11566     TSC2;939;     1     0.9797748
## 11567     TSC2;981;     1     0.9874541
# ...and labeled mTOR sites still score low.
Akt.adp2[Akt.adp2$Identifier %in% dt.mTOR$V1, ]
##          Identifier Class predictResult
## 504     AHNAK;5536;    -1    0.06712881
## 640     AKT1S1;255;    -1    0.29352988
## 761   ANKRD17;2041;    -1    0.09196799
## 904       APRF;727;    -1    0.09202448
## 3465    DEPDC6;265;    -1    0.04832725
## 3468    DEPDC6;293;    -1    0.26932414
## 3469    DEPDC6;299;    -1    0.04865975
## 3926   EIF4EBP1;36;    -1    0.07723848
## 3931   EIF4EBP1;64;    -1    0.08462828
## 3935   EIF4EBP2;37;    -1    0.18916819
## 3936   EIF4EBP2;46;    -1    0.15615188
## 4716     FRAP;2481;    -1    0.14830071
## 5123     GRB10;503;    -1    0.22155091
## 5780      IRS1;632;    -1    0.22916041
## 7640       MAF1;60;    -1    0.31483816
## 7641       MAF1;68;    -1    0.28315141
## 8839    NUMA1;1982;    -1    0.18857993
## 9006     PATL1;184;    -1    0.28201796
## 9007     PATL1;194;    -1    0.10565903
## 9955    RAPTOR;863;    -1    0.12058494
## 9956    RAPTOR;877;    -1    0.03446233
## 10344  RPS6KB1;427;    -1    0.43448001
## 10346  RPS6KB1;444;    -1    0.32069051
## 10348  RPS6KB1;452;    -1    0.42111416
## 11110 TBC1D10B;693;    -1    0.10607723
## 11684     ULK1;763;    -1    0.12062006
# Final mTOR ensemble: positives vs. samples from the curated negative pool.
mTOR.adp2 <- model.adp2.func(dt.mTOR.fulmodel, dt.mTOR, mTOR.neg, 1.1, 1, dt.mTOR.fulmodel[, 2:17])
# Sanity check: labeled positives score high...
mTOR.adp2[mTOR.adp2$Class==1, ]
##          Identifier Class predictResult
## 504     AHNAK;5536;     1     0.9289009
## 640     AKT1S1;255;     1     0.9474065
## 761   ANKRD17;2041;     1     0.9437604
## 904       APRF;727;     1     0.9409647
## 3465    DEPDC6;265;     1     0.9302519
## 3468    DEPDC6;293;     1     0.9547459
## 3469    DEPDC6;299;     1     0.7228846
## 3926   EIF4EBP1;36;     1     0.7955299
## 3931   EIF4EBP1;64;     1     0.9522370
## 3935   EIF4EBP2;37;     1     0.9300593
## 3936   EIF4EBP2;46;     1     0.8801363
## 4716     FRAP;2481;     1     0.9178426
## 5123     GRB10;503;     1     0.9871049
## 5780      IRS1;632;     1     0.9494586
## 7640       MAF1;60;     1     0.9669362
## 7641       MAF1;68;     1     0.9794232
## 8839    NUMA1;1982;     1     0.9418279
## 9006     PATL1;184;     1     0.9931401
## 9007     PATL1;194;     1     0.9397145
## 9955    RAPTOR;863;     1     0.9461211
## 9956    RAPTOR;877;     1     0.7034168
## 10344  RPS6KB1;427;     1     0.9417799
## 10346  RPS6KB1;444;     1     0.9243999
## 10348  RPS6KB1;452;     1     0.9418219
## 11110 TBC1D10B;693;     1     0.9630401
## 11684     ULK1;763;     1     0.9575721
# ...while labeled Akt sites hover around 0.5, as in the bagging run.
mTOR.adp2[mTOR.adp2$Identifier %in% dt.Akt$V1, ]
##          Identifier Class predictResult
## 646     AKT1S1;318;    -1     0.5496129
## 1215     AS160;324;    -1     0.5129993
## 1219     AS160;595;    -1     0.5508560
## 1226     AS160;649;    -1     0.5417644
## 1469       BAD;136;    -1     0.4259434
## 1913       BRF1;92;    -1     0.4892282
## 2101    CARHSP1;53;    -1     0.2402407
## 3760    ECNOS;1176;    -1     0.5474039
## 3764      EDC3;161;    -1     0.3740810
## 4632     FKHR2;252;    -1     0.5423527
## 4705     FOXO1;316;    -1     0.4887895
## 5155      GSK3A;21;    -1     0.4592087
## 5159       GSK3B;9;    -1     0.5081475
## 5751      IRS1;265;    -1     0.5117899
## 5774      IRS1;522;    -1     0.5668681
## 6873  KIAA1272;715;    -1     0.5676430
## 9225    PFKFB2;469;    -1     0.5250197
## 9226    PFKFB2;486;    -1     0.4896158
## 9930     RANBP3;58;    -1     0.4866869
## 11565    TSC2;1466;    -1     0.4916748
## 11566     TSC2;939;    -1     0.4534282
## 11567     TSC2;981;    -1     0.4680266
# Final results summary.
# (Fixed the message wording: "technic" -> "technique".)
print("Our final Akt prediction result with the adaptive sampling technique is saved in data frame Akt.adp2")
## [1] "Our final Akt prediction result with the adaptive sampling technique is saved in data frame Akt.adp2"
head(Akt.adp2)
##            Identifier Class predictResult
## 1       100043914;88;    -1    0.66712713
## 2       100043914;89;    -1    0.48211721
## 3   1110018J18RIK;18;    -1    0.04828794
## 4  1110037F02RIK;133;    -1    0.02528055
## 5  1110037F02RIK;138;    -1    0.01501212
## 6 1110037F02RIK;1628;    -1    0.01935558
print("Our final mTOR prediction result with the adaptive sampling technique is saved in data frame mTOR.adp2")
## [1] "Our final mTOR prediction result with the adaptive sampling technique is saved in data frame mTOR.adp2"
head(mTOR.adp2)
##            Identifier Class predictResult
## 1       100043914;88;    -1    0.49566130
## 2       100043914;89;    -1    0.60989320
## 3   1110018J18RIK;18;    -1    0.26519363
## 4  1110037F02RIK;133;    -1    0.04341526
## 5  1110037F02RIK;138;    -1    0.05378188
## 6 1110037F02RIK;1628;    -1    0.15526402

compares prediction diff

1. Use bootstrapping to calculate a confidence interval for the average difference

2. Generate a plot to show the distribution of the differences

comparison with 2016

# Compare our predictions with the baseline (df.Akt / df.mTOR — presumably
# the 2016 published predictions; confirm where they are loaded).
diff.func ("predictResult", df.Akt, Akt.adp2, "Akt Adp 2 Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in  predictResult -0.78 -0.18"

diff.func ("predictResult", df.mTOR, mTOR.adp2, "mTOR Adp 2 Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in  predictResult -1.92 -0.87"

# same comparison for the plain bagging results (no adaptive sampling)
diff.func ("predictResult", df.Akt, aktNancies, "Akt Full Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in  predictResult -0.33 0.25"

diff.func ("predictResult", df.mTOR, mTORNancies, "mTOR Full Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in  predictResult -2.59 -1.64"

using simulation and/or create a conservative lower and/or upper bound for performance

# Summarize each feature (min/quartiles/mean/max from summary(), plus sd).
feat.list <- dt.Insulin[, c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "AktMotif", "mTORMotif", "fitted.score")]
feat.summary <- sapply(feat.list, summary)
# NOTE(review): dplyr::funs() is deprecated in newer dplyr; fine for the
# dplyr version this document was knitted with.
feat.sd <- summarise_all(feat.list, funs(sd))
row.names(feat.sd) <- "sd"

# append the sd row to the summary table (rows indexed by name later)
feat.summary <- rbind(feat.summary, feat.sd)
#View(feat.summary)

# Histogram of each feature with a fitted normal curve overlaid.
par(mfrow=c(2, 2))

for (i in 1:ncol(feat.list)) {
  
  x <- feat.list[, i]
  h<-hist(x, breaks=30, col="red", xlab=colnames(feat.summary)[i], main=paste0("Histogram with Normal Curve - ", colnames(feat.summary)[i]))
  xfit<-seq(min(x),max(x),length=40)
  yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
  # scale the density to the histogram's count axis
  yfit <- yfit*diff(h$mids[1:2])*length(x)
  lines(xfit, yfit, col="blue", lwd=2)
}

# Rationale for the simulation step below.
print("We can see all features are normally distributed, we can generate simulation data set based on distribution and summary of each feature")
## [1] "We can see all features are normally distributed, we can generate simulation data set based on distribution and summary of each feature"
# generate simulation data 
dt.simulation.func <- function(dt){
  
  fullset <- NULL
  Identifier <- seq.int(1, 100000, 1)
  fullset <- cbind(fullset, Identifier)
  
  for (i in 1:ncol(dt)) {
    set.seed(123)
    x <- rnorm(n=100000, m=dt[c("Mean"), i], sd=dt[c("sd"), i])
    col <- x[x>=dt[c("Min."), i] & x<=dt[c("Max."), i]] 
    fullset <- cbind(fullset, col)
  }
  return(fullset)
}

# NOTE(review): the recycling warnings below are evidence of a defect in
# dt.simulation.func — the in-range filter shortens each column and cbind()
# recycles it, misaligning rows. See the function for the fix.
dt.sim <- dt.simulation.func(feat.summary)
## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)

## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)

## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)

## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)

## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)
dt.sim <- data.frame(dt.sim)
colnames(dt.sim) <- c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "AktMotif", "mTORMotif", "fitted.score")
#class(dt.sim)
#sapply(dt.sim, class)
#View(dt.sim)
#ncol(dt.sim)
#ncol(feat.summary)

# check if any missing value in simulation data set 
sum(!complete.cases(dt.sim))
## [1] 0
# Per-kinase simulated sets: drop the other kinase's motif column so the
# column layout matches the corresponding *.fulmodel table.
dt.simulation.Akt <- dt.sim[sample(1:nrow(dt.sim), size = nrow(dt.Insulin)), -which(names(dt.sim) == "mTORMotif")]
dt.simulation.mTOR <- dt.sim[sample(1:nrow(dt.sim), size = nrow(dt.Insulin)), -which(names(dt.sim) == "AktMotif")]

# Score the simulated data with the fitted Akt and mTOR models.
Akt.sim.adp2 <- model.adp2.func(dt.Akt.fulmodel, dt.Akt, Akt.neg, 1.1, 1, dt.simulation.Akt)

mTOR.sim.adp2 <- model.adp2.func(dt.mTOR.fulmodel, dt.mTOR, mTOR.neg, 1.1, 1, dt.simulation.mTOR)

# Histogram of prediction scores with an overlaid normal curve, scaled to the
# count axis (density * bin width * n).
hist.with.normal <- function(scores, title) {
  h <- hist(scores, breaks = 30, col = "red",
            xlab = "Predict Result", main = title)
  grid.x <- seq(min(scores), max(scores), length = 40)
  grid.y <- dnorm(grid.x, mean = mean(scores), sd = sd(scores)) *
    diff(h$mids[1:2]) * length(scores)
  lines(grid.x, grid.y, col = "blue", lwd = 2)
}

# side by side: simulated vs. given insulin data, Akt model
par(mfrow = c(1, 2))

hist.with.normal(Akt.sim.adp2[, "predictResult"], "Akt Simulation Data")
hist.with.normal(Akt.adp2[, "predictResult"], "Akt Given Data")

print("From the plot, we can see Akt prediction results for both simulation data and actual data are following the similar distribution curve ")
## [1] "From the plot, we can see Akt prediction results for both simulation data and actual data are following the similar distribution curve "
# Side-by-side histograms of mTOR prediction scores: simulated data on the
# left, the given insulin data on the right, each with a fitted normal curve
# rescaled from density to counts.
par(mfrow = c(1, 2))

sim.scores <- mTOR.sim.adp2[, "predictResult"]
sim.h <- hist(sim.scores, breaks = 30, col = "red",
              xlab = "Predict Result", main = "mTOR Simulation Data")
sim.grid <- seq(min(sim.scores), max(sim.scores), length = 40)
lines(sim.grid,
      dnorm(sim.grid, mean = mean(sim.scores), sd = sd(sim.scores)) *
        diff(sim.h$mids[1:2]) * length(sim.scores),
      col = "blue", lwd = 2)

obs.scores <- mTOR.adp2[, "predictResult"]
obs.h <- hist(obs.scores, breaks = 30, col = "red",
              xlab = "Predict Result", main = "mTOR Given Data")
obs.grid <- seq(min(obs.scores), max(obs.scores), length = 40)
lines(obs.grid,
      dnorm(obs.grid, mean = mean(obs.scores), sd = sd(obs.scores)) *
        diff(obs.h$mids[1:2]) * length(obs.scores),
      col = "blue", lwd = 2)

print("From the plot, we can see mTOR prediction results for both simulation data and actual data are following the similar distribution curve ")
## [1] "From the plot, we can see mTOR prediction results for both simulation data and actual data are following the similar distribution curve "

Output session information

# Record the exact R version, platform, locale and package versions used for
# this analysis so the results can be reproduced.
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin16.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
##  [1] grid      stats4    parallel  splines   stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] MASS_7.3-47         kernlab_0.9-25      pROC_1.10.0        
##  [4] seqLogo_1.42.0      RColorBrewer_1.1-2  bindrcpp_0.2       
##  [7] kknn_1.3.1          Biostrings_2.44.2   XVector_0.16.0     
## [10] IRanges_2.10.5      S4Vectors_0.14.7    BiocGenerics_0.22.1
## [13] mlbench_2.1-1       tidyr_0.7.1         stringr_1.2.0      
## [16] scales_0.4.1        gbm_2.1.3           survival_2.41-3    
## [19] e1071_1.6-8         class_7.3-14        dplyr_0.7.2        
## [22] plyr_1.8.4          caret_6.0-77        ggplot2_2.2.1      
## [25] lattice_0.20-35     mclust_5.3         
## 
## loaded via a namespace (and not attached):
##  [1] ddalpha_1.3.1      sfsmisc_1.1-1      foreach_1.4.3     
##  [4] prodlim_1.6.1      assertthat_0.2.0   DRR_0.0.2         
##  [7] yaml_2.1.14        robustbase_0.92-7  ipred_0.9-6       
## [10] backports_1.1.0    glue_1.1.1         digest_0.6.12     
## [13] colorspace_1.3-2   recipes_0.1.0      htmltools_0.3.6   
## [16] Matrix_1.2-10      timeDate_3012.100  pkgconfig_2.0.1   
## [19] CVST_0.2-1         zlibbioc_1.22.0    purrr_0.2.4       
## [22] gower_0.1.2        lava_1.5.1         tibble_1.3.4      
## [25] withr_2.0.0        nnet_7.3-12        lazyeval_0.2.0    
## [28] magrittr_1.5       evaluate_0.10.1    nlme_3.1-131      
## [31] dimRed_0.1.0       tools_3.4.1        munsell_0.4.3     
## [34] compiler_3.4.1     RcppRoll_0.2.2     rlang_0.1.1       
## [37] iterators_1.0.8    igraph_1.1.2       labeling_0.3      
## [40] rmarkdown_1.6      gtable_0.2.0       ModelMetrics_1.1.0
## [43] codetools_0.2-15   reshape2_1.4.2     R6_2.2.2          
## [46] lubridate_1.6.0    knitr_1.17         bindr_0.1         
## [49] rprojroot_1.2      stringi_1.1.5      Rcpp_0.12.12      
## [52] rpart_4.1-11       DEoptimR_1.0-8